Data visualization of viral contigs prediction by VirSorter, VirSorter2, DeepVirFinder¶
In [1]:
from math import ceil
import os
import glob
from os.path import join
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
In [2]:
# Set seaborn plotting aesthetics as default
sns.set()
Virsorter pre-processing¶
In [3]:
virsorter_list = glob.glob("virsorter/*/VIRSorter_global-phage-signal.csv")
len(virsorter_list)
Out[3]:
123
In [4]:
def read_tsv_file(fp):
try:
# Attempt to read the TSV file
df = pd.read_csv(fp, comment="#", sep=",", header=None)
# Add a new column with the directory name
return df
except pd.errors.EmptyDataError:
# Determine the number of columns by checking the header of another file
sample_df = pd.read_csv(virsorter_list[0], comment="#", sep=",", header=None, nrows=1)
columns = ["Contig_id", "Nb genes contigs", "Fragment", "Nb genes", "Category", "Nb phage hallmark genes", "Phage gene enrichment sig", "Non-Caudovirales phage gene enrichment sig", "Pfam depletion sig", "Uncharacterized enrichment sig", "Strand switch depletion sig", "Short genes enrichment sig"]
# Create a DataFrame with null values for each column
empty_df = pd.DataFrame([["null"] * 12])
return empty_df
In [5]:
df_virsorter = pd.concat([read_tsv_file(fp).assign(New=os.path.basename(os.path.dirname(fp))) for fp in virsorter_list])
df_virsorter.columns = ["Contig_id", "Nb genes contigs", "Fragment", "Nb genes", "Category", "Nb phage hallmark genes", "Phage gene enrichment sig", "Non-Caudovirales phage gene enrichment sig", "Pfam depletion sig", "Uncharacterized enrichment sig", "Strand switch depletion sig", "Short genes enrichment sig", "Sample_name"]
sample_name = df_virsorter.pop("Sample_name")
df_virsorter.insert(0,"Sample_name", sample_name)
df_virsorter['Contig_id'] = df_virsorter['Contig_id'].str.replace('VIRSorter_', '', regex=False)
df_virsorter['Contig_id'] = df_virsorter['Contig_id'].str.replace('-circular', '', regex=False)
df_virsorter['length'] = df_virsorter['Contig_id'].str.split('_').str[3]
df_virsorter['length'] = df_virsorter['length'].str.replace(' ', '', regex=False)
df_virsorter['length'] = df_virsorter['length'].astype(float)
df_virsorter = df_virsorter.loc[df_virsorter['length'] >= 1000 ]
df_virsorter
Out[5]:
| Sample_name | Contig_id | Nb genes contigs | Fragment | Nb genes | Category | Nb phage hallmark genes | Phage gene enrichment sig | Non-Caudovirales phage gene enrichment sig | Pfam depletion sig | Uncharacterized enrichment sig | Strand switch depletion sig | Short genes enrichment sig | length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BER_0XMTAHIFY | NODE_11015_length_1722_cov_4_046191 | 5 | VIRSorter_NODE_11015_length_1722_cov_4_046191 | 5 | 1 | 1.0 | 2.41026382682884 | NaN | 2.96905455092977 | NaN | NaN | NaN | 1722.0 |
| 1 | BER_0XMTAHIFY | NODE_14423_length_1447_cov_3_596983 | 4 | VIRSorter_NODE_14423_length_1447_cov_3_596983 | 4 | 1 | 1.0 | 3.04440229453496 | NaN | 2.37524364074382 | NaN | NaN | NaN | 1447.0 |
| 2 | BER_0XMTAHIFY | NODE_15705_length_1371_cov_2_724164 | 3 | VIRSorter_NODE_15705_length_1371_cov_2_724164 | 3 | 1 | 1.0 | 2.28330172090122 | NaN | NaN | NaN | NaN | NaN | 1371.0 |
| 3 | BER_0XMTAHIFY | NODE_2110_length_5594_cov_7_496480 | 8 | VIRSorter_NODE_2110_length_5594_cov_7_496480 | 8 | 1 | 3.0 | 6.08880458906991 | NaN | 4.75048728148764 | NaN | NaN | NaN | 5594.0 |
| 4 | BER_0XMTAHIFY | NODE_2979_length_4285_cov_4_086525 | 8 | VIRSorter_NODE_2979_length_4285_cov_4_086525 | 8 | 1 | 1.0 | 2.25950718871954 | NaN | 3.36313445460326 | NaN | NaN | NaN | 4285.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 275 | TOK_XV92PXO2D | NODE_58_length_84686_cov_7_783460 | 86 | VIRSorter_NODE_58_length_84686_cov_7_783460-ge... | 32 | 3 | NaN | NaN | NaN | gene_48-gene_79:7.37854174991945 | gene_52-gene_75:4.47459491092940 | NaN | NaN | 84686.0 |
| 1 | Unknown_YSOUSNDN9 | NODE_150_length_1622_cov_3_324825 | 7 | VIRSorter_NODE_150_length_1622_cov_3_324825 | 7 | 2 | NaN | 3.735218 | NaN | NaN | NaN | NaN | NaN | 1622.0 |
| 3 | Unknown_YSOUSNDN9 | NODE_253_length_1439_cov_3_143064 | 6 | VIRSorter_NODE_253_length_1439_cov_3_143064 | 6 | 2 | NaN | 3.201616 | NaN | NaN | NaN | NaN | NaN | 1439.0 |
| 5 | Unknown_YSOUSNDN9 | NODE_374_length_1301_cov_9_247191 | 4 | VIRSorter_NODE_374_length_1301_cov_9_247191 | 4 | 2 | NaN | 2.13441 | NaN | NaN | NaN | NaN | NaN | 1301.0 |
| 6 | Unknown_YSOUSNDN9 | NODE_652_length_1108_cov_21_948718 | 5 | VIRSorter_NODE_652_length_1108_cov_21_948718 | 5 | 2 | NaN | 2.668013 | NaN | NaN | NaN | NaN | NaN | 1108.0 |
11462 rows × 14 columns
In [6]:
df_virsorter_counted = df_virsorter.groupby(["Sample_name"]).count()
df_virsorter_counted = df_virsorter_counted[["Contig_id"]]
df_virsorter_counted.rename(columns={'Contig_id': 'Contig_id_VirSorter'}, inplace= True)
df_virsorter_counted.reset_index(inplace=True)
df_virsorter.groupby(["Sample_name"]).count()
Out[6]:
| Contig_id | Nb genes contigs | Fragment | Nb genes | Category | Nb phage hallmark genes | Phage gene enrichment sig | Non-Caudovirales phage gene enrichment sig | Pfam depletion sig | Uncharacterized enrichment sig | Strand switch depletion sig | Short genes enrichment sig | length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Sample_name | |||||||||||||
| BER_0XMTAHIFY | 155 | 155 | 155 | 155 | 155 | 50 | 61 | 8 | 143 | 78 | 6 | 37 | 155 |
| BER_1FZ1M3LBT | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| BER_5UIO53M4H | 37 | 37 | 37 | 37 | 37 | 7 | 31 | 2 | 17 | 1 | 0 | 0 | 37 |
| BER_62VRQ43DO | 20 | 20 | 20 | 20 | 20 | 9 | 14 | 2 | 18 | 6 | 1 | 1 | 20 |
| BER_6LAJ3M790 | 2 | 2 | 2 | 2 | 2 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TOK_EZO4LULQ4 | 94 | 94 | 94 | 94 | 94 | 12 | 83 | 3 | 58 | 8 | 1 | 10 | 94 |
| TOK_N2P2DPKIY | 85 | 85 | 85 | 85 | 85 | 24 | 59 | 2 | 72 | 13 | 0 | 19 | 85 |
| TOK_SM080U360 | 64 | 64 | 64 | 64 | 64 | 4 | 64 | 0 | 0 | 0 | 0 | 3 | 64 |
| TOK_XV92PXO2D | 171 | 171 | 171 | 171 | 171 | 47 | 106 | 5 | 151 | 36 | 1 | 37 | 171 |
| Unknown_YSOUSNDN9 | 4 | 4 | 4 | 4 | 4 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 4 |
114 rows × 13 columns
VirSorter2¶
In [7]:
virsorter2_list = glob.glob("virsorter2/*/final-viral-score.tsv")
len(virsorter2_list)
Out[7]:
123
In [8]:
df_virsorter2 = pd.concat([pd.read_csv(fp, sep="\t").assign(Sample_name=os.path.basename(os.path.dirname(fp))) for fp in virsorter2_list])
sample_name_v2 = df_virsorter2.pop("Sample_name")
df_virsorter2.insert(0,"Sample_name", sample_name_v2)
df_virsorter2[['seqname', 'type']] = df_virsorter2['seqname'].str.split(r'\|\|', expand=True)
df_virsorter2.rename(columns={'seqname': 'Contig_id'}, inplace= True)
df_virsorter2['Contig_id'] = df_virsorter2['Contig_id'].str.replace('.','_')
#df_virsorter2 = df_virsorter2.loc[df_virsorter2['max_score'] >= 0.9]
df_virsorter2 = df_virsorter2.loc[df_virsorter2['length'] >= 1000]
df_virsorter2
C:\Users\Olga\AppData\Local\Temp\ipykernel_31648\2783735064.py:1: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation. df_virsorter2 = pd.concat([pd.read_csv(fp, sep="\t").assign(Sample_name=os.path.basename(os.path.dirname(fp))) for fp in virsorter2_list])
Out[8]:
| Sample_name | Contig_id | dsDNAphage | NCLDV | RNA | ssDNA | lavidaviridae | max_score | max_score_group | length | hallmark | viral | cellular | type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BER_0XMTAHIFY | NODE_53_length_176986_cov_28_951625 | 0.853 | 0.040 | 0.000 | 0.000 | 0.000 | 0.853 | dsDNAphage | 176555 | 6 | 30.1 | 16.5 | full |
| 1 | BER_0XMTAHIFY | NODE_106_length_100541_cov_12_450958 | 0.833 | 0.007 | 0.000 | 0.000 | 0.000 | 0.833 | dsDNAphage | 98010 | 1 | 24.6 | 22.9 | full |
| 2 | BER_0XMTAHIFY | NODE_171_length_60244_cov_8_122348 | 0.887 | 0.213 | 0.000 | 0.000 | 0.007 | 0.887 | dsDNAphage | 60132 | 9 | 28.4 | 27.0 | full |
| 3 | BER_0XMTAHIFY | NODE_324_length_29490_cov_8_063836 | 0.967 | 0.213 | 0.000 | 0.000 | 0.020 | 0.967 | dsDNAphage | 29488 | 1 | 38.9 | 2.8 | full |
| 4 | BER_0XMTAHIFY | NODE_370_length_25515_cov_17_079615 | 0.967 | 0.020 | 0.000 | 0.007 | 0.013 | 0.967 | dsDNAphage | 25513 | 11 | 54.5 | 36.4 | full |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 29 | Unknown_YSOUSNDN9 | NODE_882_length_1018_cov_2_137072 | 0.440 | NaN | 0.255 | 0.960 | 0.380 | 0.960 | ssDNA | 1018 | 0 | 33.3 | 0.0 | full |
| 30 | Unknown_YSOUSNDN9 | NODE_886_length_1016_cov_2_861602 | 0.333 | NaN | 0.340 | 0.093 | 0.527 | 0.527 | lavidaviridae | 1015 | 0 | 33.3 | 0.0 | full |
| 31 | Unknown_YSOUSNDN9 | NODE_163_length_1594_cov_3_375569 | NaN | NaN | NaN | NaN | NaN | NaN | dsDNAphage | 1594 | 1 | 100.0 | 0.0 | lt2gene |
| 32 | Unknown_YSOUSNDN9 | NODE_212_length_1493_cov_3_253129 | NaN | NaN | NaN | NaN | NaN | NaN | dsDNAphage | 1493 | 1 | 100.0 | 0.0 | lt2gene |
| 33 | Unknown_YSOUSNDN9 | NODE_900_length_1011_cov_2_092050 | NaN | NaN | NaN | NaN | NaN | NaN | dsDNAphage | 1011 | 1 | 100.0 | 0.0 | lt2gene |
87783 rows × 14 columns
In [9]:
df_virsorter2_counted = df_virsorter2.groupby(["Sample_name"]).count()
df_virsorter2_counted = df_virsorter2_counted[["Contig_id"]]
df_virsorter2_counted.rename(columns={'Contig_id': 'Contig_id_VirSorter2'}, inplace= True)
df_virsorter2_counted.reset_index(inplace=True)
df_virsorter2.groupby(["Sample_name"]).count()
Out[9]:
| Contig_id | dsDNAphage | NCLDV | RNA | ssDNA | lavidaviridae | max_score | max_score_group | length | hallmark | viral | cellular | type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Sample_name | |||||||||||||
| BER_0XMTAHIFY | 1176 | 1074 | 863 | 1074 | 1074 | 1074 | 1096 | 1176 | 1176 | 1176 | 1176 | 1176 | 1176 |
| BER_1FZ1M3LBT | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| BER_5UIO53M4H | 487 | 442 | 365 | 442 | 442 | 442 | 450 | 487 | 487 | 487 | 487 | 487 | 487 |
| BER_62VRQ43DO | 110 | 105 | 103 | 105 | 105 | 105 | 106 | 110 | 110 | 110 | 110 | 110 | 110 |
| BER_6LAJ3M790 | 18 | 17 | 16 | 17 | 17 | 17 | 18 | 18 | 18 | 18 | 18 | 18 | 18 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TOK_EZO4LULQ4 | 772 | 718 | 644 | 718 | 718 | 718 | 728 | 772 | 772 | 772 | 772 | 772 | 772 |
| TOK_N2P2DPKIY | 757 | 674 | 523 | 674 | 674 | 674 | 682 | 757 | 757 | 757 | 757 | 757 | 757 |
| TOK_SM080U360 | 318 | 287 | 252 | 287 | 287 | 287 | 295 | 318 | 318 | 318 | 318 | 318 | 318 |
| TOK_XV92PXO2D | 1394 | 1243 | 980 | 1243 | 1243 | 1243 | 1256 | 1394 | 1394 | 1394 | 1394 | 1394 | 1394 |
| Unknown_YSOUSNDN9 | 33 | 30 | 21 | 30 | 30 | 30 | 30 | 33 | 33 | 33 | 33 | 33 | 33 |
121 rows × 13 columns
DeepVirFinder¶
In [10]:
deepvirfinder_list = glob.glob("deepvirfinder/*/*dvfpred.txt")
len(deepvirfinder_list)
Out[10]:
123
In [11]:
df_deepvirfinder = pd.concat([pd.read_csv(fp, sep="\t").assign(Sample_name=os.path.basename(os.path.dirname(fp))) for fp in deepvirfinder_list])
sample_name_dvf = df_deepvirfinder.pop("Sample_name")
df_deepvirfinder.insert(0,"Sample_name", sample_name_dvf)
df_deepvirfinder = df_deepvirfinder.rename(columns={'name': 'Contig_id'})
df_deepvirfinder['Contig_id'] = df_deepvirfinder['Contig_id'].str.replace('.','_')
df_deepvirfinder = df_deepvirfinder.loc[df_deepvirfinder['score'] >= 0.9] ## 10% error
df_deepvirfinder = df_deepvirfinder.loc[df_deepvirfinder['len'] >= 1000]
df_deepvirfinder
Out[11]:
| Sample_name | Contig_id | len | score | pvalue | |
|---|---|---|---|---|---|
| 40 | BER_0XMTAHIFY | NODE_42_length_194988_cov_8_780940 | 194988 | 0.910056 | 0.017033 |
| 65 | BER_0XMTAHIFY | NODE_2_length_796906_cov_246_742030 | 796906 | 0.908315 | 0.017109 |
| 149 | BER_0XMTAHIFY | NODE_150_length_66695_cov_8_224595 | 66695 | 0.949682 | 0.013672 |
| 165 | BER_0XMTAHIFY | NODE_154_length_66337_cov_8_275369 | 66337 | 0.929793 | 0.015579 |
| 166 | BER_0XMTAHIFY | NODE_170_length_60866_cov_8_550509 | 60866 | 0.933900 | 0.015258 |
| ... | ... | ... | ... | ... | ... |
| 804 | Unknown_YSOUSNDN9 | NODE_802_length_1049_cov_2_828974 | 1049 | 0.999941 | 0.001246 |
| 817 | Unknown_YSOUSNDN9 | NODE_820_length_1042_cov_2_851064 | 1042 | 0.999861 | 0.001888 |
| 854 | Unknown_YSOUSNDN9 | NODE_848_length_1031_cov_1_960041 | 1031 | 0.933558 | 0.015258 |
| 866 | Unknown_YSOUSNDN9 | NODE_868_length_1025_cov_1_929897 | 1025 | 1.000000 | 0.000000 |
| 888 | Unknown_YSOUSNDN9 | NODE_890_length_1014_cov_2_138686 | 1014 | 0.996957 | 0.005306 |
70478 rows × 5 columns
In [12]:
deepvirfinder_counted = df_deepvirfinder.groupby(["Sample_name"]).count()
deepvirfinder_counted = deepvirfinder_counted[["Contig_id"]]
deepvirfinder_counted.rename(columns={'Contig_id': 'Contig_id_DVF'}, inplace= True)
deepvirfinder_counted.reset_index(inplace=True)
deepvirfinder_counted
Out[12]:
| Sample_name | Contig_id_DVF | |
|---|---|---|
| 0 | BER_0XMTAHIFY | 679 |
| 1 | BER_1FZ1M3LBT | 13 |
| 2 | BER_5UIO53M4H | 1241 |
| 3 | BER_62VRQ43DO | 142 |
| 4 | BER_6LAJ3M790 | 45 |
| ... | ... | ... |
| 118 | TOK_EZO4LULQ4 | 531 |
| 119 | TOK_N2P2DPKIY | 694 |
| 120 | TOK_SM080U360 | 361 |
| 121 | TOK_XV92PXO2D | 1069 |
| 122 | Unknown_YSOUSNDN9 | 45 |
123 rows × 2 columns
Contig number comparison¶
Data preparation¶
In [13]:
merged_df_counted = df_virsorter_counted.merge(df_virsorter2_counted, on="Sample_name", how='outer').merge(deepvirfinder_counted, on = "Sample_name", how='outer')
merged_df_counted_vs_only = df_virsorter_counted.merge(df_virsorter2_counted, on="Sample_name")
vs = df_virsorter_counted["Sample_name"].to_list()
vs2 = df_virsorter2_counted["Sample_name"].to_list()
dvf = deepvirfinder_counted["Sample_name"].to_list()
In [14]:
merged_df_counted_melted = merged_df_counted.melt(id_vars=["Sample_name"], value_vars=['Contig_id_VirSorter','Contig_id_VirSorter2','Contig_id_DVF'], var_name="Tool", value_name="Nb. of contigs")
merged_df_counted_melted = merged_df_counted_melted.sort_values('Nb. of contigs', ascending=False)
merged_df_counted_melted_vs_only = merged_df_counted.melt(id_vars=["Sample_name"], value_vars=['Contig_id_VirSorter','Contig_id_VirSorter2'], var_name="Tool", value_name="Nb. of contigs")
merged_df_counted_melted_vs_only = merged_df_counted_melted_vs_only.sort_values('Nb. of contigs', ascending=False)
Data visualization¶
In [15]:
def grouped_bar_plot(df, a):
# Customize here: If there are multiple samples, add `order = samples_ordered` to the barplot command
p = sns.barplot(data = df, x = 'Sample_name', y = "Nb. of contigs", hue = 'Tool', \
palette = "colorblind", ax = a)
p.tick_params(axis='x', labelrotation=90, size = 5)
p.set_ylabel('Number of Contigs', size = 40)
p.set_xlabel('Samples Names', size = 40)
p.get_legend().remove()
return p
In [16]:
%%capture
fig, axs = plt.subplots(1, 1, figsize = (28,12)) # in inches
plt.xticks(size = 13)
plt.yticks(size = 25)
axs.set_title("Number of Viral contigs identified by each tool", size = 60, pad=30)
In [17]:
grouped_bar_plot(merged_df_counted_melted, axs)
Out[17]:
<Axes: title={'center': 'Number of Viral contigs identified by each tool'}, xlabel='Samples Names', ylabel='Number of Contigs'>
Compare number of contigs identified as viral by each tool
In [18]:
# Add a global legend referring to all and plot the graph
handles, labels = axs.get_legend_handles_labels()
fig.legend(handles, ["DeepVirFinder","VirSorter2","VirSorter"], loc = 'center right', fontsize = 40)
fig
Out[18]:
In [19]:
%%capture
fig1, axs1 = plt.subplots(1, 1, figsize = (28,12)) # in inches
plt.xticks(size = 13)
plt.yticks(size = 25)
axs1.set_title("Number of Viral contigs identified by VirSorter and VirSorter2", size = 60, pad=30)
In [20]:
grouped_bar_plot(merged_df_counted_melted_vs_only, axs1)
Out[20]:
<Axes: title={'center': 'Number of Viral contigs identified by VirSorter and VirSorter2'}, xlabel='Samples Names', ylabel='Number of Contigs'>
Do the same comparison but only for VirSOrter and VirSorter2
In [21]:
# Add a global legend referring to all and plot the graph
handles, labels = axs1.get_legend_handles_labels()
fig1.legend(handles, ["VirSorter2","VirSorter"], loc = 'center right', fontsize = 40)
fig1
Out[21]:
Compare viral contigs identified by each tool¶
In [22]:
import pandas as pd
from matplotlib_venn import venn3, venn3_circles
import matplotlib.pyplot as plt
plot Venn diagram for VirSorter VirSSorter2 and DeepVirFinder
df_virsorter df_virsorter2 df_deepvirfinder
In [23]:
virsorter_contigs = set(df_virsorter["Contig_id"])
virsorter2_contigs = set(df_virsorter2["Contig_id"])
dvf_contigs = set(df_deepvirfinder["Contig_id"])
In [24]:
plt.figure(figsize=(10, 7))
venn3([virsorter_contigs, virsorter2_contigs,dvf_contigs], ('VirSorter', 'VirSorter2', 'DeepVirFinder'), set_colors=("#0173b2", "#de8f05", "#029e73" ), alpha=0.6)
venn3_circles(subsets=(virsorter_contigs, virsorter2_contigs,dvf_contigs), linewidth=0.5)
plt.title('Venn Diagram of commonalities and differencies between pipelines results', size = 20, pad=30)
plt.show()
Vien diagram for each sample seaprately¶
In [25]:
df_virsorter['tool'] = 'VirSorter'
df_virsorter2['tool'] = 'VirSorter2'
df_deepvirfinder['tool'] = 'DVF'
combined_df = pd.concat([df_virsorter, df_virsorter2, df_deepvirfinder])
# Get the list of unique samples
sample_names = combined_df['Sample_name'].unique()
# Function to plot Venn diagram for each sample
def plot_venn_for_sample(sample):
sample_data = combined_df[combined_df['Sample_name'] == sample]
contig_ids_1 = set(sample_data[sample_data['tool'] == 'VirSorter']['Contig_id'])
contig_ids_2 = set(sample_data[sample_data['tool'] == 'VirSorter2']['Contig_id'])
contig_ids_3 = set(sample_data[sample_data['tool'] == 'DVF']['Contig_id'])
plt.figure(figsize=(10, 7))
venn3([contig_ids_1, contig_ids_2, contig_ids_3], ('VirSorter', 'VirSorter2', 'DVF'), set_colors=("#0173b2", "#de8f05", "#029e73" ))
plt.title(f'Venn Diagram for Sample {sample}')
plt.show()
# Plot Venn diagram for each unique sample
for sample in sample_names:
plot_venn_for_sample(sample)
C:\Users\Olga\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\matplotlib_venn\layout\venn3\pairwise.py:169: UserWarning: Bad circle positioning.
warnings.warn("Bad circle positioning.")
C:\Users\Olga\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\matplotlib_venn\layout\venn3\pairwise.py:103: UserWarning: Circle A has zero area.
warnings.warn("Circle A has zero area.")
C:\Users\Olga\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\matplotlib_venn\layout\venn3\pairwise.py:107: UserWarning: Circle B has zero area.
warnings.warn("Circle B has zero area.")
heatmap¶
Load number of all contigs
Take Sample name and AMR value
In [26]:
df_amr = pd.read_csv("../metadata.csv", sep=",")
df_amr = df_amr[["Sample_name","Sum_RPKM"]].copy()
In [ ]:
Load data. As DVF saves the result for each contig, its dataframe was used to retrieve all contigs name form the dataset
In [27]:
all_contigs = pd.concat([pd.read_csv(fp, sep="\t").assign(Sample_name=os.path.basename(os.path.dirname(fp))) for fp in deepvirfinder_list])
sample_name_dvf = all_contigs.pop("Sample_name")
all_contigs.insert(0,"Sample_name", sample_name_dvf)
all_contigs = all_contigs.rename(columns={'name': 'Contig_id'})
all_contigs['Contig_id'] = all_contigs['Contig_id'].str.replace('.','_')
all_contigs['contig_id_all'] = all_contigs['Contig_id']
all_contigs = all_contigs.loc[all_contigs['len'] >= 1000]
all_contigs
Out[27]:
| Sample_name | Contig_id | len | score | pvalue | contig_id_all | |
|---|---|---|---|---|---|---|
| 0 | BER_0XMTAHIFY | NODE_25_length_283529_cov_12_214824 | 283529 | 3.357433e-01 | 0.111054 | NODE_25_length_283529_cov_12_214824 |
| 1 | BER_0XMTAHIFY | NODE_22_length_301175_cov_14_146377 | 301175 | 5.025283e-01 | 0.060258 | NODE_22_length_301175_cov_14_146377 |
| 2 | BER_0XMTAHIFY | NODE_24_length_291638_cov_77_923847 | 291638 | 4.616548e-01 | 0.080520 | NODE_24_length_291638_cov_77_923847 |
| 3 | BER_0XMTAHIFY | NODE_23_length_292037_cov_32_250947 | 292037 | 6.629067e-01 | 0.036124 | NODE_23_length_292037_cov_32_250947 |
| 4 | BER_0XMTAHIFY | NODE_21_length_307159_cov_86_194820 | 307159 | 6.482564e-01 | 0.037257 | NODE_21_length_307159_cov_86_194820 |
| ... | ... | ... | ... | ... | ... | ... |
| 929 | Unknown_YSOUSNDN9 | NODE_931_length_1001_cov_1_978858 | 1001 | 1.618433e-01 | 0.160208 | NODE_931_length_1001_cov_1_978858 |
| 930 | Unknown_YSOUSNDN9 | NODE_910_length_1007_cov_2_834034 | 1007 | 6.921884e-01 | 0.033783 | NODE_910_length_1007_cov_2_834034 |
| 931 | Unknown_YSOUSNDN9 | NODE_932_length_1000_cov_2_369312 | 1000 | 8.186931e-10 | 0.985403 | NODE_932_length_1000_cov_2_369312 |
| 932 | Unknown_YSOUSNDN9 | NODE_929_length_1001_cov_2_540169 | 1001 | 2.468593e-03 | 0.444520 | NODE_929_length_1001_cov_2_540169 |
| 933 | Unknown_YSOUSNDN9 | NODE_928_length_1001_cov_2_651163 | 1001 | 5.650779e-01 | 0.046359 | NODE_928_length_1001_cov_2_651163 |
1849893 rows × 6 columns
In [28]:
all_counted = all_contigs.groupby(["Sample_name"]).count()
all_counted = all_counted[["contig_id_all"]]
all_counted.reset_index(inplace=True)
all_counted
Out[28]:
| Sample_name | contig_id_all | |
|---|---|---|
| 0 | BER_0XMTAHIFY | 25272 |
| 1 | BER_1FZ1M3LBT | 73 |
| 2 | BER_5UIO53M4H | 15502 |
| 3 | BER_62VRQ43DO | 2175 |
| 4 | BER_6LAJ3M790 | 675 |
| ... | ... | ... |
| 118 | TOK_EZO4LULQ4 | 13595 |
| 119 | TOK_N2P2DPKIY | 14386 |
| 120 | TOK_SM080U360 | 39740 |
| 121 | TOK_XV92PXO2D | 28325 |
| 122 | Unknown_YSOUSNDN9 | 934 |
123 rows × 2 columns
load initial df
In [29]:
deepvirfinder_counted
df_virsorter_counted
df_virsorter2_counted
Out[29]:
| Sample_name | Contig_id_VirSorter2 | |
|---|---|---|
| 0 | BER_0XMTAHIFY | 1176 |
| 1 | BER_1FZ1M3LBT | 2 |
| 2 | BER_5UIO53M4H | 487 |
| 3 | BER_62VRQ43DO | 110 |
| 4 | BER_6LAJ3M790 | 18 |
| ... | ... | ... |
| 116 | TOK_EZO4LULQ4 | 772 |
| 117 | TOK_N2P2DPKIY | 757 |
| 118 | TOK_SM080U360 | 318 |
| 119 | TOK_XV92PXO2D | 1394 |
| 120 | Unknown_YSOUSNDN9 | 33 |
121 rows × 2 columns
merge all_contigs column¶
Merge all contigs column and sort it based on Sum_RPKM values
In [30]:
deepvirfinder_rpkm = deepvirfinder_counted.merge(all_counted, on="Sample_name", how="outer").merge(df_amr, on="Sample_name").sort_values("Sum_RPKM", ascending= False).fillna(0)
virsorter_rpkm = df_virsorter_counted.merge(all_counted, on="Sample_name", how="outer").merge(df_amr, on="Sample_name").sort_values("Sum_RPKM", ascending= False).fillna(0)
virsorter2_rpkm = df_virsorter2_counted.merge(all_counted, on="Sample_name", how="outer").merge(df_amr, on="Sample_name").sort_values("Sum_RPKM", ascending= False).fillna(0)
In [31]:
deepvirfinder_rpkm["DeepVirFinder_density"] = deepvirfinder_rpkm["Contig_id_DVF"]/deepvirfinder_rpkm["contig_id_all"]
virsorter_rpkm["VirSorter_density"] = virsorter_rpkm["Contig_id_VirSorter"]/virsorter_rpkm["contig_id_all"]
virsorter2_rpkm["VirSorter2_density"] = virsorter2_rpkm["Contig_id_VirSorter2"]/virsorter2_rpkm["contig_id_all"]
In [32]:
virsorter2_rpkm_indexed = virsorter2_rpkm.set_index(virsorter2_rpkm["Sample_name"])
virsorter_rpkm_indexed = virsorter_rpkm.set_index(virsorter_rpkm["Sample_name"])
deepvirfinder_rpkm_indexed = deepvirfinder_rpkm.set_index(deepvirfinder_rpkm["Sample_name"])
In [33]:
deepvirfinder_rpkm_indexed
Out[33]:
| Sample_name | Contig_id_DVF | contig_id_all | Sum_RPKM | DeepVirFinder_density | |
|---|---|---|---|---|---|
| Sample_name | |||||
| ILR_U6M78A4O4 | ILR_U6M78A4O4 | 302 | 15884 | 1663.135815 | 0.019013 |
| Other_B8T9MC9HX | Other_B8T9MC9HX | 11 | 39 | 1251.715658 | 0.282051 |
| ILR_5O6P46B1O | ILR_5O6P46B1O | 1520 | 54313 | 1251.715658 | 0.027986 |
| NYC_0OSAG4JMN | NYC_0OSAG4JMN | 326 | 13305 | 1164.001298 | 0.024502 |
| ILR_YGMN39LS1 | ILR_YGMN39LS1 | 806 | 33652 | 790.184125 | 0.023951 |
| ... | ... | ... | ... | ... | ... |
| Other_2MFW9YKGQ | Other_2MFW9YKGQ | 7 | 81 | 2.594936 | 0.086420 |
| Other_RGFMTG8DA | Other_RGFMTG8DA | 52 | 274 | 2.432254 | 0.189781 |
| DOH_Z97N7FJ2K | DOH_Z97N7FJ2K | 18 | 212 | 1.722112 | 0.084906 |
| BER_6LAJ3M790 | BER_6LAJ3M790 | 45 | 675 | 1.530793 | 0.066667 |
| Other_34XFCIUBK | Other_34XFCIUBK | 31 | 495 | 1.337777 | 0.062626 |
123 rows × 5 columns
In [34]:
df_original_density = virsorter_rpkm_indexed[["VirSorter_density"]].merge(virsorter2_rpkm_indexed[["VirSorter2_density"]], on= "Sample_name").merge(deepvirfinder_rpkm_indexed[["DeepVirFinder_density"]], on = "Sample_name")
In [35]:
df_original_density
Out[35]:
| VirSorter_density | VirSorter2_density | DeepVirFinder_density | |
|---|---|---|---|
| Sample_name | |||
| ILR_U6M78A4O4 | 0.012465 | 0.050932 | 0.019013 |
| Other_B8T9MC9HX | 0.000000 | 0.358974 | 0.282051 |
| ILR_5O6P46B1O | 0.006812 | 0.056837 | 0.027986 |
| NYC_0OSAG4JMN | 0.008568 | 0.057121 | 0.024502 |
| ILR_YGMN39LS1 | 0.011946 | 0.058243 | 0.023951 |
| ... | ... | ... | ... |
| Other_2MFW9YKGQ | 0.000000 | 0.024691 | 0.086420 |
| Other_RGFMTG8DA | 0.003650 | 0.051095 | 0.189781 |
| DOH_Z97N7FJ2K | 0.018868 | 0.051887 | 0.084906 |
| BER_6LAJ3M790 | 0.002963 | 0.026667 | 0.066667 |
| Other_34XFCIUBK | 0.000000 | 0.024242 | 0.062626 |
123 rows × 3 columns
In [36]:
cb = sns.color_palette("viridis", 2000)
In [37]:
fig2, axs2 = plt.subplots(1, 1, figsize = (40,40))
plt.xticks(size = 50)
axs2.set_title("Viral density per sample", size = 60, pad=30)
p2 = sns.heatmap(df_original_density, linewidth=.5, cmap = cb, vmax=0.15)
p2.set_ylabel("Sample", size=40)
p2
Out[37]:
<Axes: title={'center': 'Viral density per sample'}, ylabel='Sample'>